# Load the Times Higher Education 2021 rankings dataset.
library(readr)
# NOTE(review): the file name contains a space ("THE2021 2.csv"), which looks
# like a browser-duplicated download — confirm this is the intended file.
data <- read_csv("THE2021 2.csv")
#str(data)

1) Explore the dataset and correlations

I will use the DataExplorer package, which helps to explore data patterns quickly.

# Quick structural overview of the dataset with DataExplorer.
library(DataExplorer)
# introduce() reports row/column counts, discrete vs continuous columns,
# and missing-value totals in a single-row summary.
introduce(data)
## # A tibble: 1 x 9
##    rows columns discrete_columns continuous_colu… all_missing_col…
##   <int>   <int>            <int>            <int>            <int>
## 1  1448      11                2                9                0
## # … with 4 more variables: total_missing_values <int>, complete_rows <int>,
## #   total_observations <int>, memory_usage <dbl>
# Same summary, visualised as a bar chart.
plot_intro(data)

WOW! No missing values!

# Histogram of each continuous variable to inspect its distribution.
plot_histogram(data)

# Kernel density estimates of the same variables.
plot_density(data)

The distributions are skewed and the variables sit on very different scales, so they must be rescaled.

# Rescale the nine numeric score columns (3:11) to the [0, 1] range so the
# variables become comparable before PCA.
library(scales)
# lapply() applies rescale() column by column; apply() would first coerce the
# data frame to a matrix, which is an unnecessary (and type-unsafe) detour.
data[, 3:11] <- lapply(data[, 3:11], rescale)
summary(data[, 3:11])
##  scores_teaching   scores_research   scores_citations scores_industry_income
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000       
##  1st Qu.:0.08222   1st Qu.:0.04649   1st Qu.:0.2166   1st Qu.:0.02099       
##  Median :0.14148   Median :0.10865   Median :0.4373   Median :0.07646       
##  Mean   :0.19144   Mean   :0.17330   Mean   :0.4675   Mean   :0.18321       
##  3rd Qu.:0.24456   3rd Qu.:0.23486   3rd Qu.:0.7034   3rd Qu.:0.22639       
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000       
##  scores_international_outlook stats_number_students stats_student_staff_ratio
##  Min.   :0.0000               Min.   :0.00000       Min.   :0.0000           
##  1st Qu.:0.1618               1st Qu.:0.04148       1st Qu.:0.1044           
##  Median :0.3324               Median :0.07545       Median :0.1414           
##  Mean   :0.3869               Mean   :0.09787       Mean   :0.1628           
##  3rd Qu.:0.5725               3rd Qu.:0.12727       3rd Qu.:0.1952           
##  Max.   :1.0000               Max.   :1.00000       Max.   :1.0000           
##  stats_pc_intl_students stats_female_share
##  Min.   :0.00000        Min.   :0.0000    
##  1st Qu.:0.02381        1st Qu.:0.4330    
##  Median :0.08333        Median :0.5361    
##  Mean   :0.13409        Mean   :0.5045    
##  3rd Qu.:0.20238        3rd Qu.:0.5876    
##  Max.   :1.00000        Max.   :1.0000

Scaling is successful! Let’s explore correlations:

First option – heatmap

# Pearson correlation matrix of the nine rescaled variables.
corr <- cor(data[, 3:11], method = "pearson")
# library() errors immediately if a package is missing; require() merely
# returns FALSE, which would let the script continue and fail later with a
# confusing error.
library(reshape2)
library(ggplot2)
# Melt the matrix to long form and draw a tile heatmap; a diverging palette
# is anchored at 0 and bounded to the full [-1, 1] correlation range.
ggplot(reshape2::melt(corr, varnames = c("x", "y"), value.name = "correlation"),
       aes(x = x, y = y)) +
  geom_tile(aes(fill = correlation)) +
  scale_fill_gradient2(low = "green", mid = "yellow", high = "red",
                       guide = guide_colorbar(ticks = FALSE, barheight = 5),
                       limits = c(-1, 1)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Heatmap of Correlation Matrix", x = NULL, y = NULL)

Second option – plot_correlation (love it more due to correlation numbers)

# Alternative view: correlation plot with the coefficients printed per cell.
plot_correlation(data)

Well, scores_international_outlook and stats_pc_intl_students, scores_research and scores_teaching are highly correlated.

2) Run PCA using prcomp()

PCA itself:

# Run PCA on the rescaled variables. Centering is required for PCA; scale.
# is left at its default FALSE because the columns were already rescaled to
# [0, 1] above.
pca_scores <- prcomp(data[, 3:11], center = TRUE)
# Use FALSE, not F: T/F are ordinary variables and can be reassigned.
str(pca_scores, give.attr = FALSE)
## List of 5
##  $ sdev    : num [1:9] 0.409 0.253 0.194 0.144 0.118 ...
##  $ rotation: num [1:9, 1:9] -0.286 -0.378 -0.573 -0.296 -0.543 ...
##  $ center  : Named num [1:9] 0.191 0.173 0.467 0.183 0.387 ...
##  $ scale   : logi FALSE
##  $ x       : num [1:1448, 1:9] -1.33 -1.26 -1.08 -1.31 -1.35 ...

3) Is it possible to produce an acceptable PCA solution on these data? Answer the question and motivate your answer.

library(dplyr)

summary(pca_scores) # proportion of variance explained per component
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     0.4095 0.2532 0.1944 0.14402 0.11831 0.10281 0.08035
## Proportion of Variance 0.5109 0.1953 0.1151 0.06321 0.04266 0.03221 0.01968
## Cumulative Proportion  0.5109 0.7063 0.8214 0.88462 0.92728 0.95949 0.97917
##                            PC8     PC9
## Standard deviation     0.06835 0.04652
## Proportion of Variance 0.01424 0.00659
## Cumulative Proportion  0.99341 1.00000
pca_scores$sdev ^ 2 # eigenvalues; NOTE(review): on 0-1 rescaled data all are < 1, so the Kaiser (> 1) rule would retain no components and is not informative here
## [1] 0.167650766 0.064097961 0.037774093 0.020740957 0.013998367 0.010570108
## [7] 0.006456299 0.004671878 0.002163861
pca.var <- pca_scores$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1) # percent of variance per PC, used for the scree plot (and axis labels later)
barplot(pca.var.per, 
        main = "Scree Plot", 
        xlab = "Principal Component", 
        ylab = "Percent Variation")

According to the importance of components, we can produce an acceptable PCA solution on these data: the variance is concentrated in the first few components rather than spread evenly. PC1 alone accounts for more than 51% of the total variance, PC2 explains over 19%, and PC3 over 11%, so the first three components already cover about 82% of the total variance (see the cumulative proportion).

# Line-type scree plot of all nine PCs with an eigenvalue = 1 reference line.
screeplot(pca_scores, type = "l", npcs = 9, main = "Screeplot of the first 9 PCs")
abline(h = 1, col = "red", lty = 5)
legend("topright", legend = c("Eigenvalue = 1"),
       col = c("red"), lty = 5, cex = 0.6)

# Cumulative proportion of variance explained by the first k components.
cumpro <- cumsum(pca_scores$sdev^2 / sum(pca_scores$sdev^2))
# cumpro[0:9] silently dropped the 0 index; plot the whole vector directly.
plot(cumpro, xlab = "PC #", ylab = "Amount of explained variance",
     main = "Cumulative variance plot")
abline(v = 6, col = "blue", lty = 5)
# NOTE(review): 0.88759 matches none of the cumulative values above
# (PC4 = 0.885, PC5 = 0.927, PC6 = 0.959) — confirm the intended cut-off.
abline(h = 0.88759, col = "blue", lty = 5)
legend("topleft", legend = c("Cut-off @ PC6"),
       col = c("blue"), lty = 5, cex = 0.6)

From the plots, my suggestion is to reduce dimensionality from 9 to 5: the first five components retain about 92.7% of the variance, so we lose only about 7%.

4) Describe the results of PCA, explain how much variance the first PCs explain, plot the result using package ‘pca3d’.

PC1 represents mostly stats_student_staff_ratio. PC2 represents scores_industry_income and stats_female_share. PC3 stands for stats_female_share (again) and scores_teaching. PC4 represents stats_student_staff_ratio, stats_female_share (again), stats_pc_intl_students, stats_number_students, and scores_teaching. PC5 represents stats_number_students, stats_pc_intl_students, stats_student_staff_ratio, and stats_female_share (again).

# 2-D scatter of the observations on the first two principal components.
library(pca3d)
pca2d(pca_scores)

5) Are the best universities located in the US and Canada? Create a binary variable for US and Canada vs the rest, and use it for colouring the PC plot. (8/10)

# Binary indicator: 1 = university located in the US or Canada, 0 = elsewhere.
# Inside mutate() columns are referenced directly (no data$ prefix, which
# bypasses dplyr's data masking), and %in% + if_else() replaces the
# two-branch case_when().
data <- data %>%
  mutate(bin = if_else(location %in% c("United States", "Canada"), 1, 0))
# Colour the PC1/PC2 scatter by the US/Canada indicator.
pca2d(pca_scores, group = data$bin)

6)* Add PC coordinates to the dataset and use t-test on PC1 for the binary variable you created.

# Compare mean PC1 scores between the two groups (US/Canada vs the rest).
pca_data <- data.frame(
  X = pca_scores$x[, 1],
  test = data$bin)

# BUG FIX: t.test(pca_data$test, pca_data$X) compared the 0/1 indicator
# values themselves against the PC1 scores, which answers the wrong
# question. The formula interface tests PC1 *between* the two groups
# defined by the indicator. (The Welch-test output shown below was
# produced by the old, incorrect call and is now stale.)
t.test(X ~ test, data = pca_data)
## 
##  Welch Two Sample t-test
## 
## data:  pca_data$test and pca_data$X
## t = 9.8551, df = 2815.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1111936 0.1664307
## sample estimates:
##    mean of x    mean of y 
## 1.388122e-01 3.964489e-18

So, we can reject the null hypothesis and conclude that a difference between the means of the two groups exists.

7)* Make a ggplot of the PCA solution and turn it into a plotly graph.

library(stringr)
# Strip mis-encoded replacement characters from the university names; they
# broke ggplot's text rendering of the labels.
data$name <- str_replace_all(data$name, "�", "")

# Plotting frame with the name and the first two PC coordinates.
# data$name extracts a plain character vector; data[, 1] on a tibble would
# return a one-column tibble, so the resulting column name could not be
# relied upon in aes().
pca_data <- data.frame(name = data$name,
                       X = pca_scores$x[, 1],
                       Y = pca_scores$x[, 2])
# Label every university at its PC1/PC2 position; axis labels carry the
# percentage of variance computed earlier (pca.var.per).
ggplot(data = pca_data, aes(x = X, y = Y, label = name)) +
  geom_text() +
  xlab(paste0("PC1 - ", pca.var.per[1], "%")) +
  ylab(paste0("PC2 - ", pca.var.per[2], "%")) +
  theme_bw() +
  ggtitle("My PCA Graph")

To be honest, this looks like a black blob… Turning into plotly:

# Turn the most recent ggplot into an interactive plotly widget.
# ggplotly()'s p argument already defaults to ggplot2::last_plot(), so the
# bare call is equivalent.
library(plotly)
ggplotly()